# -*- encoding: utf-8 -*-
"""
@Date    :   2024/01/24 15:11:27
@Author  :   orange-crow
@File    :   crawl_webpage.py
"""

from metagpt.roles.di.data_interpreter import DataInterpreter

# PAPER_LIST_REQ = """
# Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,
# and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*
# """

# ECOMMERCE_REQ = """
# Get products data from website https://scrapeme.live/shop/ and save it as a csv file.
# **Notice: Firstly parse the web page encoding and the text HTML structure;
# The first page product name, price, product URL, and image URL must be saved in the csv;**
# """

# Natural-language task given verbatim to the DataInterpreter agent: scrape one
# exam paper from gkzenti.cn and emit its questions as JSON records matching a
# database schema.
# NOTE(review): the prompt tells the agent to use requests/BeautifulSoup, but
# main() registers the `scrape_web_playwright` tool — confirm which scraping
# path is actually intended.
WEB_CRAWL_PROMPT = """
Extract all exam questions and answers from https://www.gkzenti.cn/paper/1600261871781mv9 and format them according to the database schema. Follow these steps:

1. Use requests and BeautifulSoup to fetch and parse the webpage
2. Identify and iterate through all question blocks
3. For each question extract:
   - id: Concatenate paper_id + "_" + question_number
   - topic_id: Extract from URL pattern
   - type: Map to 'single'/'multiple'/'essay' based on options
   - content: Full question text without options
   - options: JSON array of options (e.g., ["A. Option1", "B. Option2"])
   - answer: Correct option letter(s)
   - analysis: Detailed solution explanation
   - difficulty: 1-5 scale estimate
   - tags: Subject categories from page context
   - source: "gkzenti.cn"
   - year: Extract from paper title

Output format:
json
[
{
"id": "1600261871781mv9_1",
"topic_id": "1600261871781mv9",
"type": "single",
"content": "Which programming language...",
"options": ["A. Python", "B. Java", ...],
"answer": "A",
"analysis": "Python is interpreted...",
"difficulty": 2,
"tags": "programming, basic",
"source": "gkzenti.cn",
"year": 2023,
"knowledge_points": "Language features",
"category": "Computer Science"
},

]
Requirements:
1. Ensure all timestamp fields use: (strftime('%s', 'now') * 1000)
2. Handle special characters in content/text
3. Add retry logic for failed requests
4. Verify answer letters match options
5. Convert markdown formatting to plain text
6. Include error handling for missing elements
"""

async def main():
    """Drive a DataInterpreter agent through the exam-scraping prompt.

    NOTE(review): the prompt asks the agent to use requests/BeautifulSoup,
    while the tool registered here is `scrape_web_playwright` — confirm which
    scraping path is intended.
    """
    interpreter = DataInterpreter(tools=["scrape_web_playwright"])
    await interpreter.run(WEB_CRAWL_PROMPT)


if __name__ == "__main__":
    # Imported here rather than at module top so the import is only paid when
    # the file is executed as a script.
    import asyncio

    # asyncio.run creates a fresh event loop, runs main() to completion, and
    # closes the loop.
    asyncio.run(main())
